# This is a BitKeeper generated patch for the following project: # Project Name: Linux kernel tree # This patch format is intended for GNU patch command version 2.5 or higher. # This patch includes the following deltas: # ChangeSet 1.1063.1.11 -> 1.1063.1.12 # include/linux/mmzone.h 1.10 -> 1.11 # mm/page_alloc.c 1.57 -> 1.58 # init/main.c 1.28 -> 1.29 # mm/bootmem.c 1.10 -> 1.11 # # The following is the BitKeeper ChangeSet Log # -------------------------------------------- # 03/08/12 agruen@suse.de 1.1063.4.24 # [PATCH] More steal_locks fixes: we should be in full LSB compliance now # # By Andreas and Herbert Xu # # Index: linux-2.4.22-rc2.orig/fs/exec.c # =================================================================== # -------------------------------------------- # 03/08/12 bjorn.helgaas@hp.com 1.1069.1.10 # ia64: Fix check for binutils that supports "hint" instructions. # -------------------------------------------- # 03/08/12 bjorn.helgaas@hp.com 1.1074 # Merge hp.com:/home/helgaas/linux/to-marcelo-2.4 # into hp.com:/home/helgaas/linux/linux-ia64-2.4 # -------------------------------------------- # 03/08/12 bjorn.helgaas@hp.com 1.1075 # Merge hp.com:/home/helgaas/linux/ia64-extras # into hp.com:/home/helgaas/linux/linux-ia64-2.4 # -------------------------------------------- # 03/08/12 steiner@SGI.com 1.1063.1.12 # discontig/NUMA support # # Attached is the patch for discontig memory for 2.4.21. This patch # has been tested on the ZX1 & NEC platforms & appears to work ok. It # also works on SN2 but there are additional patches (unrelated to # discontig) that are still needed in 2.4.21. 
# -------------------------------------------- # diff -Nru a/include/linux/mmzone.h b/include/linux/mmzone.h --- a/include/linux/mmzone.h Wed Oct 8 09:09:59 2003 +++ b/include/linux/mmzone.h Wed Oct 8 09:09:59 2003 @@ -8,6 +8,12 @@ #include #include #include +#ifdef CONFIG_DISCONTIGMEM +#include +#endif +#ifndef MAX_NUMNODES +#define MAX_NUMNODES 1 +#endif /* * Free memory management - zoned buddy allocator. @@ -110,7 +116,7 @@ * footprint of this construct is very small. */ typedef struct zonelist_struct { - zone_t * zones [MAX_NR_ZONES+1]; // NULL delimited + zone_t * zones [MAX_NUMNODES*MAX_NR_ZONES+1]; // NULL delimited } zonelist_t; #define GFP_ZONEMASK 0x0f @@ -144,8 +150,8 @@ extern int numnodes; extern pg_data_t *pgdat_list; -#define memclass(pgzone, classzone) (((pgzone)->zone_pgdat == (classzone)->zone_pgdat) \ - && ((pgzone) <= (classzone))) +#define memclass(pgzone, classzone) (((pgzone) - (pgzone)->zone_pgdat->node_zones) <= \ +((classzone) - (classzone)->zone_pgdat->node_zones)) /* * The following two are not meant for general usage. They are here as @@ -212,6 +218,18 @@ #define for_each_zone(zone) \ for(zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) +#ifdef CONFIG_NUMA +#define MAX_NR_MEMBLKS BITS_PER_LONG /* Max number of Memory Blocks */ +#include +#else /* !CONFIG_NUMA */ +#define MAX_NR_MEMBLKS 1 +#endif /* CONFIG_NUMA */ + +/* Returns the number of the current Node. 
*/ + +#ifndef CONFIG_NUMA +#define numa_node_id() (__cpu_to_node(smp_processor_id())) +#endif #ifndef CONFIG_DISCONTIGMEM diff -Nru a/init/main.c b/init/main.c --- a/init/main.c Wed Oct 8 09:09:59 2003 +++ b/init/main.c Wed Oct 8 09:09:59 2003 @@ -290,6 +290,7 @@ extern void setup_arch(char **); +extern void __init build_all_zonelists(void); extern void cpu_idle(void); unsigned long wait_init_idle; @@ -360,6 +361,7 @@ lock_kernel(); printk(linux_banner); setup_arch(&command_line); + build_all_zonelists(); printk("Kernel command line: %s\n", saved_command_line); parse_options(command_line); trap_init(); diff -Nru a/mm/bootmem.c b/mm/bootmem.c --- a/mm/bootmem.c Wed Oct 8 09:09:59 2003 +++ b/mm/bootmem.c Wed Oct 8 09:09:59 2003 @@ -49,8 +49,24 @@ bootmem_data_t *bdata = pgdat->bdata; unsigned long mapsize = ((end - start)+7)/8; - pgdat->node_next = pgdat_list; - pgdat_list = pgdat; + + /* + * sort pgdat_list so that the lowest one comes first, + * which makes alloc_bootmem_low_pages work as desired. 
+ */ + if (!pgdat_list || pgdat_list->node_start_paddr > pgdat->node_start_paddr) { + pgdat->node_next = pgdat_list; + pgdat_list = pgdat; + } else { + pg_data_t *tmp = pgdat_list; + while (tmp->node_next) { + if (tmp->node_next->node_start_paddr > pgdat->node_start_paddr) + break; + tmp = tmp->node_next; + } + pgdat->node_next = tmp->node_next; + tmp->node_next = pgdat; + } mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL); bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); @@ -259,16 +275,16 @@ if (!bdata->node_bootmem_map) BUG(); count = 0; + page = virt_to_page(phys_to_virt(bdata->node_boot_start)); idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); for (i = find_first_zero_bit(bdata->node_bootmem_map, idx); i < idx; i = find_next_zero_bit(bdata->node_bootmem_map, idx, i + 1)) { - page = pgdat->node_mem_map + i; count++; - ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); + ClearPageReserved(page+i); + set_page_count(page+i, 1); + __free_page(page+i); } total += count; diff -Nru a/mm/page_alloc.c b/mm/page_alloc.c --- a/mm/page_alloc.c Wed Oct 8 09:09:59 2003 +++ b/mm/page_alloc.c Wed Oct 8 09:09:59 2003 @@ -586,13 +586,44 @@ /* * Builds allocation fallback zone lists. 
*/ -static inline void build_zonelists(pg_data_t *pgdat) +static int __init build_zonelists_node(pg_data_t *pgdat, zonelist_t *zonelist, int j, int k) { - int i, j, k; + zone_t *zone; + switch (k) { + default: + BUG(); + /* + * fallthrough: + */ + case ZONE_HIGHMEM: + zone = pgdat->node_zones + ZONE_HIGHMEM; + if (zone->memsize) { +#ifndef CONFIG_HIGHMEM + BUG(); +#endif + zonelist->zones[j++] = zone; + } + case ZONE_NORMAL: + zone = pgdat->node_zones + ZONE_NORMAL; + if (zone->memsize) + zonelist->zones[j++] = zone; + case ZONE_DMA: + zone = pgdat->node_zones + ZONE_DMA; + if (zone->memsize) + zonelist->zones[j++] = zone; + } + + return j; +} + +static void __init build_zonelists(pg_data_t *pgdat) +{ + int i, j, k, node, local_node; + local_node = pgdat->node_id; + printk("Building zonelist for node : %d\n", local_node); for (i = 0; i <= GFP_ZONEMASK; i++) { zonelist_t *zonelist; - zone_t *zone; zonelist = pgdat->node_zonelists + i; memset(zonelist, 0, sizeof(*zonelist)); @@ -604,33 +635,32 @@ if (i & __GFP_DMA) k = ZONE_DMA; - switch (k) { - default: - BUG(); - /* - * fallthrough: - */ - case ZONE_HIGHMEM: - zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->memsize) { -#ifndef CONFIG_HIGHMEM - BUG(); -#endif - zonelist->zones[j++] = zone; - } - case ZONE_NORMAL: - zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->memsize) - zonelist->zones[j++] = zone; - case ZONE_DMA: - zone = pgdat->node_zones + ZONE_DMA; - if (zone->memsize) - zonelist->zones[j++] = zone; - } + j = build_zonelists_node(pgdat, zonelist, j, k); + /* + * Now we build the zonelist so that it contains the zones + * of all the other nodes. 
+ * We don't want to pressure a particular node, so when + * building the zones for node N, we make sure that the + * zones coming right after the local ones are those from + * node N+1 (modulo N) + */ + for (node = local_node + 1; node < numnodes; node++) + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + for (node = 0; node < local_node; node++) + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + zonelist->zones[j++] = NULL; } } +void __init build_all_zonelists(void) +{ + int i; + + for(i = 0 ; i < numnodes ; i++) + build_zonelists(NODE_DATA(i)); +} + /* * Helper functions to size the waitqueue hash table. * Essentially these want to choose hash table sizes sufficiently @@ -742,7 +772,7 @@ MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET)); } *gmap = pgdat->node_mem_map = lmem_map; - pgdat->node_size = totalpages; + pgdat->node_size = 0; pgdat->node_start_paddr = zone_start_paddr; pgdat->node_start_mapnr = (lmem_map - mem_map); pgdat->nr_zones = 0; @@ -766,6 +796,7 @@ zone->zone_pgdat = pgdat; zone->free_pages = 0; zone->need_balance = 0; + pgdat->node_size += realsize; if (!size) continue; @@ -850,7 +881,6 @@ (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); } } - build_zonelists(pgdat); } void __init free_area_init(unsigned long *zones_size)